home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
The Works of John Ruskin
/
The Works of John Ruskin on CD-ROM.iso
/
bin
/
ctypes.cpl
< prev
next >
Wrap
Text File
|
1995-10-02
|
6KB
|
245 lines
####################################################################
#
#
# File: ctypes.def
#
# Personal Library Software, July, 1993
# Tom Donaldson
#
# Function: Tokenizer definitional data for table driven tokenizer.
# This file defines a vanilla isalnum() type of tokenization. All
# lower case letters are uppercased by canonizer.
#
# The CplTabledRomanceTokenizer allows customization of tokenization by
# editing the rules that define the operation of the tokenizer. The central
# concept is the "word continuation" rule, which defines the kinds of
# characters that CANNOT be split from each other.
#
# History
# -------
#
# 27aug93 tomd Created from tknztbld.def
#
####################################################################
####################################################################
#
# Installation
# ============
#
# Database.def File
# -----------------
#
# To use the CplTabledRomanceTokenizer, you need this line in the .def
# file for the database:
#
# TOKENIZER = CplTabledRomanceTokenizer
#
#
# Tokenizer File
# --------------
#
# This file, ctypes.def, is the rule file. The tokenizer REQUIRES that
# its definition file be named "tknztbld.def". Therefore, you MUST copy
# this file as "tknztbld.def". The "tknztbld.def" file MUST be in the
# "home directory" of the database using the tokenizer, or the "system"
# directory for the CPL installation.
#
# Note that a tknztbld.def in the database's home directory takes
# precedence over a tknztbld.def in the CPL "system" directory.
#
#
####################################################################
####################################################################
#
# Section 1: Character Class Definitions
#
####################################################################
# The only rule needed for this C-type isalnum() style of tokenization
# is a "letter" rule. All characters that can take part in a token must
# be classified as a "letter". Such "letter" characters will be
# unconditionally included in tokens, and "letter" characters will be
# unconditionally considered inseparable.
# Name
# ----
Letter
EndRule
####################################################################
#
# Section 2: Character Classification Map
#
####################################################################
# ------- ----- -----------------------
# Decimal Class
# Value Name Comment
# ------- ----- -----------------------
# Digits: Note that they are classified as Letter, which is the only
# character class defined.
48 Letter # Char '0'
49 Letter # Char '1'
50 Letter # Char '2'
51 Letter # Char '3'
52 Letter # Char '4'
53 Letter # Char '5'
54 Letter # Char '6'
55 Letter # Char '7'
56 Letter # Char '8'
57 Letter # Char '9'
# Upper case letters:
65 Letter # Char 'A'
66 Letter # Char 'B'
67 Letter # Char 'C'
68 Letter # Char 'D'
69 Letter # Char 'E'
70 Letter # Char 'F'
71 Letter # Char 'G'
72 Letter # Char 'H'
73 Letter # Char 'I'
74 Letter # Char 'J'
75 Letter # Char 'K'
76 Letter # Char 'L'
77 Letter # Char 'M'
78 Letter # Char 'N'
79 Letter # Char 'O'
80 Letter # Char 'P'
81 Letter # Char 'Q'
82 Letter # Char 'R'
83 Letter # Char 'S'
84 Letter # Char 'T'
85 Letter # Char 'U'
86 Letter # Char 'V'
87 Letter # Char 'W'
88 Letter # Char 'X'
89 Letter # Char 'Y'
90 Letter # Char 'Z'
# Lower case letters:
97 Letter # Char 'a'
98 Letter # Char 'b'
99 Letter # Char 'c'
100 Letter # Char 'd'
101 Letter # Char 'e'
102 Letter # Char 'f'
103 Letter # Char 'g'
104 Letter # Char 'h'
105 Letter # Char 'i'
106 Letter # Char 'j'
107 Letter # Char 'k'
108 Letter # Char 'l'
109 Letter # Char 'm'
110 Letter # Char 'n'
111 Letter # Char 'o'
112 Letter # Char 'p'
113 Letter # Char 'q'
114 Letter # Char 'r'
115 Letter # Char 's'
116 Letter # Char 't'
117 Letter # Char 'u'
118 Letter # Char 'v'
119 Letter # Char 'w'
120 Letter # Char 'x'
121 Letter # Char 'y'
122 Letter # Char 'z'
# --- ----- -----------------------
-1 EndOfDefs # Not loaded. Just marks end of map definition.
# --- ----- -----------------------
####################################################################
#
# Section 3: Word Continuation Rules
#
####################################################################
# There is only one rule. Letter characters cannot be separated from
# each other, ever, and only Letter characters can be in tokens.
Letter *
EndRule
####################################################################
#
# Section 4: Canonization Map
#
####################################################################
# ------- ------- -----------
# Input Output
# Decimal Decimal
# Char Char
# Value Value Comment
# ------- ------- -----------
#
# Map the characters a-z to the "canonical" characters A-Z. That is,
# all letters will be upper cased.
97 65 # Char 'a' canonizes to 'A'
98 66 # Char 'b' canonizes to 'B'
99 67 # Char 'c' canonizes to 'C'
100 68 # Char 'd' canonizes to 'D'
101 69 # Char 'e' canonizes to 'E'
102 70 # Char 'f' canonizes to 'F'
103 71 # Char 'g' canonizes to 'G'
104 72 # Char 'h' canonizes to 'H'
105 73 # Char 'i' canonizes to 'I'
106 74 # Char 'j' canonizes to 'J'
107 75 # Char 'k' canonizes to 'K'
108 76 # Char 'l' canonizes to 'L'
109 77 # Char 'm' canonizes to 'M'
110 78 # Char 'n' canonizes to 'N'
111 79 # Char 'o' canonizes to 'O'
112 80 # Char 'p' canonizes to 'P'
113 81 # Char 'q' canonizes to 'Q'
114 82 # Char 'r' canonizes to 'R'
115 83 # Char 's' canonizes to 'S'
116 84 # Char 't' canonizes to 'T'
117 85 # Char 'u' canonizes to 'U'
118 86 # Char 'v' canonizes to 'V'
119 87 # Char 'w' canonizes to 'W'
120 88 # Char 'x' canonizes to 'X'
121 89 # Char 'y' canonizes to 'Y'
122 90 # Char 'z' canonizes to 'Z'
# --- ----- -----------------------
-1 -1 # Not loaded. Just marks end of map definition.
# --- ----- -----------------------
####################################################################
#
#
# End Of File: ctypes.def
#
#
####################################################################